Note: if you use BayesPrism in published research, please cite:

Chu, T., Wang, Z., Pe’er, D. Danko, C. G. Cell type and gene expression deconvolution with BayesPrism enables Bayesian integrative analysis across bulk and single-cell RNA sequencing in oncology. Nat Cancer 3, 505–517 (2022). https://doi.org/10.1038/s43018-022-00356-3

Introduction

BayesPrism performs cell type and gene expression deconvolution for bulk RNA-seq (and spatial transcriptomics) using scRNA-seq of samples collected from matched or similar tissue type. It treats the scRNA-seq as prior information, and estimates \(\mathrm{P}(\theta, Z | X, \phi)\) , i.e. the joint posterior distribution of cell type fraction \(\theta\) and cell type-specific gene expression \(Z\) in each bulk conditional on the reference \(\phi\) and each observed bulk \(X\).

We will demonstrate how to use BayesPrism to deconvolve the bulk RNA-seq dataset TCGA-GBM, using the scRNA-seq dataset collected from 8 hold-out cohorts by Yuan et al.. (https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-018-0567-9) .

Load the BayesPrism package

suppressWarnings(library(BayesPrism))
#> Loading required package: snowfall
#> Loading required package: snow
#> Loading required package: NMF
#> Loading required package: pkgmaker
#> Loading required package: registry
#> Loading required package: rngtools
#> Loading required package: cluster
#> NMF - BioConductor layer [OK] | Shared memory capabilities [NO: bigmemory] | Cores 15/16
#>   To enable shared memory capabilities, try: install.extras('
#> NMF
#> ')

Load the dataset

The rdata file is stored at "your_git_repository/tutorial.dat/tutorial.gbm.rdata"

load("../tutorial.dat/tutorial.gbm.rdata")
ls()
#> [1] "bk.dat"            "cell.state.labels" "cell.type.labels"  "sc.dat"
dim(bk.dat)
#> [1]   169 60483
head(rownames(bk.dat))
#> [1] "TCGA-06-2563-01A-01R-1849-01" "TCGA-06-0749-01A-01R-1849-01" "TCGA-06-5418-01A-01R-1849-01" "TCGA-06-0211-01B-01R-1849-01" "TCGA-19-2625-01A-01R-1850-01" "TCGA-19-4065-02A-11R-2005-01"
head(colnames(bk.dat))
#> [1] "ENSG00000000003.13" "ENSG00000000005.5"  "ENSG00000000419.11" "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"

The rdata file contains four objects required for running BayesPrism.

dim(bk.dat)
#> [1]   169 60483
head(rownames(bk.dat))
#> [1] "TCGA-06-2563-01A-01R-1849-01" "TCGA-06-0749-01A-01R-1849-01" "TCGA-06-5418-01A-01R-1849-01" "TCGA-06-0211-01B-01R-1849-01" "TCGA-19-2625-01A-01R-1850-01" "TCGA-19-4065-02A-11R-2005-01"
head(colnames(bk.dat))
#> [1] "ENSG00000000003.13" "ENSG00000000005.5"  "ENSG00000000419.11" "ENSG00000000457.12" "ENSG00000000460.15" "ENSG00000000938.11"
dim(sc.dat)
#> [1] 23793 60294
head(rownames(sc.dat))
#> [1] "PJ016.V3" "PJ016.V4" "PJ016.V5" "PJ016.V6" "PJ016.V7" "PJ016.V8"
head(colnames(sc.dat))
#> [1] "ENSG00000130876.10" "ENSG00000134438.9"  "ENSG00000269696.1"  "ENSG00000230393.1"  "ENSG00000266744.1"  "ENSG00000202281.1"
sort(table(cell.type.labels))
#> cell.type.labels
#>       tcell       oligo    pericyte endothelial     myeloid       tumor 
#>          67         160         489         492        2505       20080
sort(table(cell.state.labels))
#> cell.state.labels
#> PJ017-tumor-6 PJ032-tumor-5     myeloid_8 PJ032-tumor-4 PJ032-tumor-3 PJ017-tumor-5         tcell PJ032-tumor-2 PJ030-tumor-5     myeloid_7 PJ035-tumor-8 PJ017-tumor-4 PJ017-tumor-3 PJ017-tumor-2 PJ017-tumor-0 PJ017-tumor-1 PJ018-tumor-5     myeloid_6     myeloid_5 PJ035-tumor-7         oligo PJ048-tumor-8 PJ032-tumor-1 PJ032-tumor-0 PJ048-tumor-7 PJ025-tumor-9 PJ035-tumor-6 PJ048-tumor-6 PJ016-tumor-6 PJ018-tumor-4     myeloid_4 PJ048-tumor-5 PJ048-tumor-4 PJ016-tumor-5 PJ025-tumor-8 PJ048-tumor-3 PJ035-tumor-5 PJ030-tumor-4 PJ018-tumor-3 PJ016-tumor-4 PJ025-tumor-7     myeloid_3 PJ035-tumor-4     myeloid_2 PJ025-tumor-6 PJ018-tumor-2 PJ030-tumor-3 PJ016-tumor-3 PJ025-tumor-5 PJ030-tumor-2 PJ018-tumor-1 PJ035-tumor-3 PJ048-tumor-2 PJ018-tumor-0 PJ048-tumor-1 PJ035-tumor-2 PJ035-tumor-1 PJ016-tumor-2 PJ030-tumor-1      pericyte   endothelial PJ035-tumor-0 PJ025-tumor-4     myeloid_1 PJ048-tumor-0     myeloid_0 PJ030-tumor-0 PJ025-tumor-3 PJ016-tumor-1 PJ016-tumor-0 PJ025-tumor-2 PJ025-tumor-1 PJ025-tumor-0 
#>            22            41            49            57            62            64            67            72            73            75            81            83            89           101           107           107           113           130           141           150           160           169           171           195           228           236           241           244           261           262           266           277           303           308           319           333           334           348           361           375           381           382           385           386           397           403           419           420           421           425           429           435           437           444           463           471           474           481           482           489           492           512           523           526           545           550           563           601           619           621           630           941           971
table(cbind.data.frame(cell.state.labels, cell.type.labels))
#>                  cell.type.labels
#> cell.state.labels endothelial myeloid oligo pericyte tcell tumor
#>     endothelial           492       0     0        0     0     0
#>     myeloid_0               0     550     0        0     0     0
#>     myeloid_1               0     526     0        0     0     0
#>     myeloid_2               0     386     0        0     0     0
#>     myeloid_3               0     382     0        0     0     0
#>     myeloid_4               0     266     0        0     0     0
#>     myeloid_5               0     141     0        0     0     0
#>     myeloid_6               0     130     0        0     0     0
#>     myeloid_7               0      75     0        0     0     0
#>     myeloid_8               0      49     0        0     0     0
#>     oligo                   0       0   160        0     0     0
#>     pericyte                0       0     0      489     0     0
#>     PJ016-tumor-0           0       0     0        0     0   621
#>     PJ016-tumor-1           0       0     0        0     0   619
#>     PJ016-tumor-2           0       0     0        0     0   481
#>     PJ016-tumor-3           0       0     0        0     0   420
#>     PJ016-tumor-4           0       0     0        0     0   375
#>     PJ016-tumor-5           0       0     0        0     0   308
#>     PJ016-tumor-6           0       0     0        0     0   261
#>     PJ017-tumor-0           0       0     0        0     0   107
#>     PJ017-tumor-1           0       0     0        0     0   107
#>     PJ017-tumor-2           0       0     0        0     0   101
#>     PJ017-tumor-3           0       0     0        0     0    89
#>     PJ017-tumor-4           0       0     0        0     0    83
#>     PJ017-tumor-5           0       0     0        0     0    64
#>     PJ017-tumor-6           0       0     0        0     0    22
#>     PJ018-tumor-0           0       0     0        0     0   444
#>     PJ018-tumor-1           0       0     0        0     0   429
#>     PJ018-tumor-2           0       0     0        0     0   403
#>     PJ018-tumor-3           0       0     0        0     0   361
#>     PJ018-tumor-4           0       0     0        0     0   262
#>     PJ018-tumor-5           0       0     0        0     0   113
#>     PJ025-tumor-0           0       0     0        0     0   971
#>     PJ025-tumor-1           0       0     0        0     0   941
#>     PJ025-tumor-2           0       0     0        0     0   630
#>     PJ025-tumor-3           0       0     0        0     0   601
#>     PJ025-tumor-4           0       0     0        0     0   523
#>     PJ025-tumor-5           0       0     0        0     0   421
#>     PJ025-tumor-6           0       0     0        0     0   397
#>     PJ025-tumor-7           0       0     0        0     0   381
#>     PJ025-tumor-8           0       0     0        0     0   319
#>     PJ025-tumor-9           0       0     0        0     0   236
#>     PJ030-tumor-0           0       0     0        0     0   563
#>     PJ030-tumor-1           0       0     0        0     0   482
#>     PJ030-tumor-2           0       0     0        0     0   425
#>     PJ030-tumor-3           0       0     0        0     0   419
#>     PJ030-tumor-4           0       0     0        0     0   348
#>     PJ030-tumor-5           0       0     0        0     0    73
#>     PJ032-tumor-0           0       0     0        0     0   195
#>     PJ032-tumor-1           0       0     0        0     0   171
#>     PJ032-tumor-2           0       0     0        0     0    72
#>     PJ032-tumor-3           0       0     0        0     0    62
#>     PJ032-tumor-4           0       0     0        0     0    57
#>     PJ032-tumor-5           0       0     0        0     0    41
#>     PJ035-tumor-0           0       0     0        0     0   512
#>     PJ035-tumor-1           0       0     0        0     0   474
#>     PJ035-tumor-2           0       0     0        0     0   471
#>     PJ035-tumor-3           0       0     0        0     0   435
#>     PJ035-tumor-4           0       0     0        0     0   385
#>     PJ035-tumor-5           0       0     0        0     0   334
#>     PJ035-tumor-6           0       0     0        0     0   241
#>     PJ035-tumor-7           0       0     0        0     0   150
#>     PJ035-tumor-8           0       0     0        0     0    81
#>     PJ048-tumor-0           0       0     0        0     0   545
#>     PJ048-tumor-1           0       0     0        0     0   463
#>     PJ048-tumor-2           0       0     0        0     0   437
#>     PJ048-tumor-3           0       0     0        0     0   333
#>     PJ048-tumor-4           0       0     0        0     0   303
#>     PJ048-tumor-5           0       0     0        0     0   277
#>     PJ048-tumor-6           0       0     0        0     0   244
#>     PJ048-tumor-7           0       0     0        0     0   228
#>     PJ048-tumor-8           0       0     0        0     0   169
#>     tcell                   0       0     0        0    67     0

QC of cell type and state labels

We recommend first plotting the pairwise correlation matrix between cell states and between cell types. This will give us a sense of their quality. In cases where cell types/states are not represented by sufficient amount of information (low cell count and/or low library size), the low-quality cell types/states tend to cluster together. Users may re-cluster the data at higher granularity, or merge those cell types/states with the most similar cell types/states, or remove them (if re-clustering and merging is not appropriate).

plot.cor.phi (input=sc.dat,
                         input.labels=cell.state.labels,
                         title="cell state correlation",
                         #specify pdf.prefix if need to output to pdf
                         #pdf.prefix="gbm.cor.cs", 
                         cexRow=0.2, cexCol=0.2,
                         margins=c(2,2))

plot.cor.phi (input=sc.dat, 
                         input.labels=cell.type.labels, 
                         title="cell type correlation",
                         #specify pdf.prefix if need to output to pdf
                         #pdf.prefix="gbm.cor.ct",
                         cexRow=0.5, cexCol=0.5,
                         )

Filter outlier genes

Gene expressed at high magnitude, such as ribosomal protein genes and mitochondrial genes, may dominate the distribution and bias the inference. These genes are often not informative in distinguishing cell types and can be a source of large spurious variance. As a result, they can be detrimental to deconvolution. We recommend the removal of these genes.

sc.stat <- plot.scRNA.outlier(
  input=sc.dat, #make sure the colnames are gene symbol or ENSMEBL ID 
  cell.type.labels=cell.type.labels,
  species="hs", #currently only human(hs) and mouse(mm) annotations are supported
  return.raw=TRUE #return the data used for plotting. 
  #pdf.prefix="gbm.sc.stat" specify pdf.prefix if need to output to pdf
)
#> EMSEMBLE IDs detected.

As shown by the plot, ribosomal protein genes often show high mean expression and low cell type specificity scores.

Users may also subset genes from sc.dat based on the statistics outputted by the function if needed.

head(sc.stat)  
#>                    exp.mean.log  max.spec other_Rb  chrM  chrX  chrY    Rb   Mrp   act    hb MALAT1
#> ENSG00000130876.10    -13.59828 0.8473029    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000134438.9     -16.08255 0.8914740    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000269696.1     -13.12002 0.7509754    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000230393.1     -13.97772 0.5654292    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000266744.1     -17.96174 0.4733715    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000202281.1     -18.42068 0.1666667    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE

sc.stat shows the log of normalized mean expression (x-axis) and the maximum specificity (y-axis) of each gene, and if each gene belongs to a potential outlier category. other_Rb are a group of curated genes mostly consists of ribosomal psuedo-genes.

bk.stat <- plot.bulk.outlier(
  bulk.input=bk.dat,#make sure the colnames are gene symbol or ENSMEBL ID 
    sc.input=sc.dat, #make sure the colnames are gene symbol or ENSMEBL ID 
  cell.type.labels=cell.type.labels,
  species="hs", #currently only human(hs) and mouse(mm) annotations are supported
  return.raw=TRUE
  #pdf.prefix="gbm.bk.stat" specify pdf.prefix if need to output to pdf
)
#> EMSEMBLE IDs detected.

head(bk.stat)
#>                    exp.mean.log  max.spec other_Rb  chrM  chrX  chrY    Rb   Mrp   act    hb MALAT1
#> ENSG00000000003.13    -9.263877 0.4825322    FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000000005.5    -13.336689 0.9736083    FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000000419.11   -10.455993 0.2275788    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000000457.12   -11.298589 0.2169448    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000000460.15   -11.648474 0.2920256    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE
#> ENSG00000000938.11   -11.152720 0.7347018    FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  FALSE

Next, we remove the genes from selected groups. Note that when sex is not identical between the reference and mixture, we recommend excluding genes from chrX and chrY. We also remove lowly transcribed genes, as the measurement of transcription of these genes tend to be noise-prone. Removal of these genes can also speed up computation.

sc.dat.filtered <- cleanup.genes (input=sc.dat,
                                  input.type="count.matrix",
                                    species="hs", 
                                    gene.group=c( "Rb","Mrp","other_Rb","chrM","MALAT1","chrX","chrY") ,
                                    exp.cells=5)
#> EMSEMBLE IDs detected.
#> number of genes filtered in each category: 
#>       Rb      Mrp other_Rb     chrM   MALAT1     chrX     chrY 
#>       89       78     1011       37        1     2464      594 
#> A total of  4214  genes from Rb Mrp other_Rb chrM MALAT1 chrX chrY  have been excluded 
#> A total of  24343  gene expressed in fewer than  5  cells have been excluded
dim(sc.dat.filtered)
#> [1] 23793 31737
#note this function only works for human data. For other species, you are advised to make plots by yourself.
plot.bulk.vs.sc (sc.input = sc.dat.filtered,
                            bulk.input = bk.dat
                            #pdf.prefix="gbm.bk.vs.sc" specify pdf.prefix if need to output to pdf
)
#> EMSEMBLE IDs detected.

We observe that protein coding genes are the most concordant group between two assays. To reduce batch effects and speed up computation, we perform deconvolution on protein coding genes. We have also tried to run BayesPrism on all genes. The results were similar.

sc.dat.filtered.pc <-  select.gene.type (sc.dat.filtered,
                                        gene.type = "protein_coding")
#> EMSEMBLE IDs detected.
#> number of genes retained in each category: 
#> 
#> protein_coding 
#>          16148

We provide a function for selecting genes by performing differential expression using pair-wise t-test between cell states from different cell types. Other differential expression analysis can also be used.


# Select marker genes (Optional)
# performing pair-wise t test for cell states from different cell types

diff.exp.stat <- get.exp.stat(sc.dat=sc.dat[,colSums(sc.dat>0)>3],# filter genes to reduce memory use
                                          cell.type.labels=cell.type.labels,
                                          cell.state.labels=cell.state.labels,
                                          psuedo.count=0.1, #a numeric value used for log2 transformation. =0.1 for 10x data, =10 for smart-seq. Default=0.1.
                                          cell.count.cutoff=50, # a numeric value to exclude cell state with number of cells fewer than this value for t test. Default=50.
                                          n.cores=1 #number of threads
                                          )
                                          

Ideally, we would like to have sufficient number of genes selected for each cell type (>50). If not, users may lower the cutoff.

To subset our count matrix over the signature genes, do

sc.dat.filtered.pc.sig <- select.marker (sc.dat=sc.dat.filtered.pc,
                                                  stat=diff.exp.stat,
                                                  pval.max=0.01,
                                                  lfc.min=0.1)
#> number of markers selected for each cell type: 
#> tumor :  8686 
#> myeloid :  575 
#> pericyte :  114 
#> endothelial :  244 
#> tcell :  123 
#> oligo :  86
dim(sc.dat.filtered.pc.sig)
#> [1] 23793  7874

To run BayesPrism using the signature genes, use reference=sc.dat.filtered.sig when call new.prism (see below).

Construct a prism object.

A prism object contains all data required for running BayesPrism, namely, a scRNA-seq reference matrix, the cell type and cell state labels of each row of reference, and the mixture matrix for bulk RNA-seq.

When using scRNA-seq count matrix as the input (recommended), user needs to specify input.type = "count.matrix". The other option for input.type is "GEP" (gene expression profile) which is a cell state by gene matrix. This option is used when using reference derived from other assays, such as sorted bulk data.

The parameter key is a character in cell.type.labels that corresponds to the malignant cell type. Set to NULL if there are no malignant cells or the malignant cells between reference and mixture are from matched samples, in which case all cell types will be treated equally.

myPrism <- new.prism(
  reference=sc.dat.filtered.pc, 
  mixture=bk.dat,
  input.type="count.matrix", 
  cell.type.labels = cell.type.labels, 
  cell.state.labels = cell.state.labels,
  key="tumor",
  outlier.cut=0.01,
    outlier.fraction=0.1,
)
#> number of cells in each cell state 
#> cell.state.labels
#> PJ017-tumor-6 PJ032-tumor-5     myeloid_8 PJ032-tumor-4 PJ032-tumor-3 PJ017-tumor-5         tcell PJ032-tumor-2 PJ030-tumor-5     myeloid_7 PJ035-tumor-8 PJ017-tumor-4 PJ017-tumor-3 PJ017-tumor-2 PJ017-tumor-0 PJ017-tumor-1 PJ018-tumor-5     myeloid_6     myeloid_5 PJ035-tumor-7         oligo PJ048-tumor-8 PJ032-tumor-1 PJ032-tumor-0 PJ048-tumor-7 PJ025-tumor-9 PJ035-tumor-6 PJ048-tumor-6 PJ016-tumor-6 PJ018-tumor-4     myeloid_4 PJ048-tumor-5 PJ048-tumor-4 PJ016-tumor-5 PJ025-tumor-8 PJ048-tumor-3 PJ035-tumor-5 PJ030-tumor-4 PJ018-tumor-3 PJ016-tumor-4 PJ025-tumor-7     myeloid_3 PJ035-tumor-4     myeloid_2 PJ025-tumor-6 PJ018-tumor-2 PJ030-tumor-3 PJ016-tumor-3 PJ025-tumor-5 PJ030-tumor-2 PJ018-tumor-1 PJ035-tumor-3 PJ048-tumor-2 PJ018-tumor-0 PJ048-tumor-1 PJ035-tumor-2 PJ035-tumor-1 PJ016-tumor-2 PJ030-tumor-1      pericyte   endothelial PJ035-tumor-0 PJ025-tumor-4     myeloid_1 PJ048-tumor-0     myeloid_0 PJ030-tumor-0 PJ025-tumor-3 PJ016-tumor-1 PJ016-tumor-0 PJ025-tumor-2 PJ025-tumor-1 PJ025-tumor-0 
#>            22            41            49            57            62            64            67            72            73            75            81            83            89           101           107           107           113           130           141           150           160           169           171           195           228           236           241           244           261           262           266           277           303           308           319           333           334           348           361           375           381           382           385           386           397           403           419           420           421           425           429           435           437           444           463           471           474           481           482           489           492           512           523           526           545           550           563           601           619           621           630           941           971 
#> Number of outlier genes filtered from mixture = 6 
#> Aligning reference and mixture... 
#> Nornalizing reference...

#Note that outlier.cut and outlier.fraction=0.1 filter genes in X whose expression fraction is greater than outlier.cut (Default=0.01) in more than outlier.fraction (Default=0.1) of bulk data. 
#Typically for dataset with reasonable quality control, very few genes will be filtered. 
#Removal of outlier genes will ensure that the inference will not be dominated by outliers, which sometimes may be resulted from poor QC in mapping.

Run BayesPrism.

Next, we start the run of BayesPrism.

Parameters to control Gibbs sampling and optimization can be specified using gibbs.control and opt.control. Do ?run.prism for details. We recommend the use of default parameters.

bp.res <- run.prism(prism = myPrism, n.cores=16)
#> Run Gibbs sampling... 
#> Current time:  2022-06-19 18:01:50 
#> Estimated time to complete:  2hrs 2mins 
#> Estimated finishing time:  2022-06-19 20:02:51 
#> Start run...
#> Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts = socketHosts, : Unknown option on commandline: rmarkdown::render('/Users/chut/Desktop/Ted/R_package/BayesPrism_raw/old/vignettes/tutorial_deconvolution.Rmd',~+~~+~encoding~+~
#> R Version:  R version 4.2.0 beta (2022-04-13 r82166)
#> snowfall 1.84-6.1 initialized (using snow 0.4-4): parallel execution on 16 CPUs.
#> 
#> Stopping cluster
#> Update the reference matrix ...
#> Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts = socketHosts, : Unknown option on commandline: rmarkdown::render('/Users/chut/Desktop/Ted/R_package/BayesPrism_raw/old/vignettes/tutorial_deconvolution.Rmd',~+~~+~encoding~+~
#> snowfall 1.84-6.1 initialized (using snow 0.4-4): parallel execution on 16 CPUs.
#> 
#> 
#> Stopping cluster
#> Run Gibbs sampling using updated reference ... 
#> Current time:  2022-06-19 20:16:07 
#> Estimated time to complete:  34mins 
#> Estimated finishing time:  2022-06-19 20:50:07 
#> Start run...
#> Warning in searchCommandline(parallel, cpus = cpus, type = type, socketHosts = socketHosts, : Unknown option on commandline: rmarkdown::render('/Users/chut/Desktop/Ted/R_package/BayesPrism_raw/old/vignettes/tutorial_deconvolution.Rmd',~+~~+~encoding~+~
#> snowfall 1.84-6.1 initialized (using snow 0.4-4): parallel execution on 16 CPUs.
#> 
#> 
#> Stopping cluster

Report the summary statistics.

bp.res
#> Input prism info: 
#> Cell states in each cell type: 
#> $tumor
#>  [1] "PJ016-tumor-0" "PJ016-tumor-3" "PJ016-tumor-2" "PJ016-tumor-6" "PJ016-tumor-4" "PJ016-tumor-1" "PJ016-tumor-5" "PJ017-tumor-3" "PJ017-tumor-2" "PJ017-tumor-1" "PJ017-tumor-0" "PJ017-tumor-4" "PJ017-tumor-5" "PJ017-tumor-6" "PJ018-tumor-4" "PJ018-tumor-2" "PJ018-tumor-3" "PJ018-tumor-1" "PJ018-tumor-0" "PJ018-tumor-5" "PJ025-tumor-9" "PJ025-tumor-2" "PJ025-tumor-4" "PJ025-tumor-3" "PJ025-tumor-0" "PJ025-tumor-1" "PJ025-tumor-8" "PJ025-tumor-7" "PJ025-tumor-5" "PJ025-tumor-6" "PJ030-tumor-4" "PJ030-tumor-0" "PJ030-tumor-1" "PJ030-tumor-5" "PJ030-tumor-3" "PJ030-tumor-2" "PJ032-tumor-0" "PJ032-tumor-1" "PJ032-tumor-5" "PJ032-tumor-2" "PJ032-tumor-4" "PJ032-tumor-3" "PJ035-tumor-2" "PJ035-tumor-3" "PJ035-tumor-6" "PJ035-tumor-1" "PJ035-tumor-0" "PJ035-tumor-4" "PJ035-tumor-5" "PJ035-tumor-8" "PJ035-tumor-7" "PJ048-tumor-0" "PJ048-tumor-3" "PJ048-tumor-4" "PJ048-tumor-2" "PJ048-tumor-6" "PJ048-tumor-1" "PJ048-tumor-8" "PJ048-tumor-7" "PJ048-tumor-5"
#> 
#> $myeloid
#> [1] "myeloid_2" "myeloid_5" "myeloid_3" "myeloid_7" "myeloid_0" "myeloid_6" "myeloid_4" "myeloid_1" "myeloid_8"
#> 
#> $pericyte
#> [1] "pericyte"
#> 
#> $endothelial
#> [1] "endothelial"
#> 
#> $tcell
#> [1] "tcell"
#> 
#> $oligo
#> [1] "oligo"
#> 
#> 
#> Identifier of the malignant cell type:  tumor 
#> Number of cell states:  73 
#> Number of cell types:  6 
#> Number of mixtures:  169 
#> Number of genes:  16145 
#> 
#> Initial cell type fractions: 
#>         tumor myeloid pericyte endothelial tcell oligo
#> Min.    0.199   0.004    0.000       0.015  0.00 0.000
#> 1st Qu. 0.702   0.051    0.014       0.036  0.00 0.010
#> Median  0.775   0.102    0.025       0.046  0.00 0.025
#> Mean    0.758   0.109    0.043       0.048  0.00 0.041
#> 3rd Qu. 0.848   0.145    0.048       0.060  0.00 0.051
#> Max.    0.952   0.578    0.503       0.133  0.01 0.329
#> Updated cell type fractions: 
#>         tumor myeloid pericyte endothelial tcell oligo
#> Min.    0.272   0.000    0.000       0.008 0.000 0.000
#> 1st Qu. 0.751   0.046    0.006       0.027 0.000 0.002
#> Median  0.816   0.090    0.014       0.039 0.000 0.011
#> Mean    0.801   0.098    0.030       0.041 0.001 0.029
#> 3rd Qu. 0.878   0.130    0.033       0.054 0.001 0.036
#> Max.    0.968   0.565    0.385       0.134 0.010 0.307

Alternatively, if user would like to run step 4 separately from 1-3, one may do the following:

# not run
bp.res.initial <- run.prism(prism = myPrism, n.cores=50, update.gibbs=FALSE)
    
bp.res.update <- update.theta (bp = bp.res.initial, optimizer="EB")

Extract results

The output from bp.res is an S4 object of the class "BayesPrism". Let’s take a look at what’s inside:

slotNames(bp.res)
#> [1] "prism"                       "posterior.initial.cellState" "posterior.initial.cellType"  "reference.update"            "posterior.theta_f"           "control_param"

We provide utility functions to extract deconvolved cell type fraction and expression from the output of run.prism.

# extract posterior mean of cell type fraction theta
theta <- get.fraction (bp=bp.res,
            which.theta="final",
            state.or.type="type")

head(theta)
#>                                  tumor    myeloid     pericyte endothelial        tcell        oligo
#> TCGA-06-2563-01A-01R-1849-01 0.8392322 0.04330542 2.996674e-02  0.07530695 6.434539e-04 0.0115452746
#> TCGA-06-0749-01A-01R-1849-01 0.7089897 0.17000434 9.898055e-10  0.01279814 6.124492e-09 0.1082078337
#> TCGA-06-5418-01A-01R-1849-01 0.8625820 0.09838973 9.722822e-03  0.02412884 5.628391e-09 0.0051766141
#> TCGA-06-0211-01B-01R-1849-01 0.8893832 0.04482222 1.132602e-02  0.05429633 1.878798e-07 0.0001720069
#> TCGA-19-2625-01A-01R-1850-01 0.9406359 0.03546978 1.963802e-03  0.01310674 1.501959e-10 0.0088238247
#> TCGA-19-4065-02A-11R-2005-01 0.6763201 0.08372637 1.849597e-01  0.01923162 3.673467e-04 0.0353948735
# extract coefficient of variation (CV) of cell type fraction
theta.cv <- bp.res@posterior.theta_f@theta.cv

head(theta.cv)
#>                                     tumor      myeloid     pericyte endothelial       tcell       oligo
#> TCGA-06-2563-01A-01R-1849-01 0.0001609700 0.0018436677 0.0027628184 0.001288848  0.04117159 0.004587287
#> TCGA-06-0749-01A-01R-1849-01 0.0002334131 0.0006803785 7.0554083932 0.005860777  7.71042236 0.001195068
#> TCGA-06-5418-01A-01R-1849-01 0.0001532302 0.0009872300 0.0063176352 0.003199483  5.60222139 0.008825872
#> TCGA-06-0211-01B-01R-1849-01 0.0001489711 0.0017054370 0.0057181930 0.001854729  2.28000904 0.188656805
#> TCGA-19-2625-01A-01R-1850-01 0.0001368551 0.0022153427 0.0146296308 0.004919253 11.90217902 0.007811778
#> TCGA-19-4065-02A-11R-2005-01 0.0003117102 0.0013605790 0.0009039545 0.004405931  0.05617673 0.002510947
# extract posterior mean of cell type-specific gene expression count matrix Z  
Z.tumor <- get.exp (bp=bp.res,
                          state.or.type="type",
                          cell.name="tumor")

head(t(Z.tumor[1:5,]))
#>                    TCGA-06-2563-01A-01R-1849-01 TCGA-06-0749-01A-01R-1849-01 TCGA-06-5418-01A-01R-1849-01 TCGA-06-0211-01B-01R-1849-01 TCGA-19-2625-01A-01R-1850-01
#> ENSG00000130876.10                       55.988                      444.964                        8.996                       56.992                       16.000
#> ENSG00000165244.6                       204.952                       37.680                      291.788                      531.084                      507.344
#> ENSG00000173597.7                        17.896                        6.564                       21.908                       28.748                       10.792
#> ENSG00000158022.6                         0.000                        0.408                        2.624                        0.956                        7.360
#> ENSG00000167220.10                     2275.204                     1061.040                     2774.508                     2365.868                     2084.588
#> ENSG00000126106.12                      677.904                      687.636                      480.912                      699.740                     1295.996
#save the result
save(bp.res, file="bp.res.rdata")

Downstream analysis

Potential downstream analysis can be performed using theta and Z are: